import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set()
from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import kaleido
import statsmodels.stats.api as sms
import statsmodels.api as sm
from matplotlib import rcParams
import statsmodels.formula.api as smf
from statsmodels.nonparametric.smoothers_lowess import lowess
import scipy.stats as stat
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
path_data = r'D:\ECM\DigitalLab\Projet_2\final_data\data.csv'
data = pd.read_csv(path_data)
data = data.drop(['Unnamed: 0'], axis = 1)
path_data = r'D:\ECM\DigitalLab\Projet_2\final_data\data_analysis.csv'
data3 = pd.read_csv(path_data)
data3 = data.drop(['Unnamed: 0'], axis = 1)
data.head()
| id_projet | Nombre_pret | MT_ACCORDE_PRET_PAR_CR | MT_FINCT_GLOBAL | MT_CRD_REEL | MT_CRD_THRQ | MT_DEBLOQUE_REALISATION | MT_UTILISE | MT_PROCH_ECHCE_THRQ | DT_DECI | ... | SURFACE_FINANCIERE | SURFACE_FINANCIERE_M6 | mt_paiement_carte_12m | nb_paiement_carte_12m | period | Nombre_ass | BARCODE | DD_HISTO | DIF_YEAR | CA_REMISE_T | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ++53bn | 3 | 39800.0 | 171837.0 | 12204.10 | 12204.10 | 39800.0 | 39800.0 | 297.13 | 2011 | ... | 6795.99 | 10698.49 | -13196.00 | 372.0 | 2019 | 1 | 0 | 2011 | 10.4 | 611.34 |
| 1 | ++53bn | 3 | 39800.0 | 171837.0 | 5587.20 | 5587.20 | 39800.0 | 39800.0 | 297.13 | 2011 | ... | 9187.53 | 9036.20 | -17890.44 | 359.0 | 2021 | 1 | 0 | 2011 | 10.4 | 611.34 |
| 2 | ++ECFZ | 2 | 120340.0 | 131342.0 | 114949.53 | 114949.53 | 120340.0 | 120340.0 | 478.40 | 2018 | ... | 4189.09 | 3053.43 | -7924.58 | 325.0 | 2019 | 1 | 0 | 2018 | 3.4 | 5006.14 |
| 3 | ++ECFZ | 2 | 120340.0 | 131342.0 | 110839.63 | 110839.63 | 120340.0 | 120340.0 | 478.40 | 2018 | ... | 2884.52 | 5522.47 | -9711.20 | 328.0 | 2020 | 1 | 0 | 2018 | 3.4 | 5006.14 |
| 4 | ++ECFZ | 2 | 120340.0 | 131342.0 | 107020.85 | 107020.85 | 120340.0 | 120340.0 | 478.40 | 2018 | ... | 3216.22 | 16445.84 | -8963.81 | 308.0 | 2021 | 1 | 0 | 2018 | 3.4 | 5006.14 |
5 rows × 173 columns
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 204534 entries, 0 to 204533 Columns: 173 entries, id_projet to CA_REMISE_T dtypes: float64(151), int64(9), object(13) memory usage: 270.0+ MB
data2 = data.copy()
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
columns_encoding =['LI_DESTN_FINCT','LI_BIEN_FINANCE','SEGMT','LI_REGRP_CSP','NIVEAU_RISQUE','type_ptf_cc','DT_SIGNER_CTR_CRED',
'type_residence_1','situ_globale', 'DD_HISTO', 'DT_1ERE_UTIL', 'DT_DECI' ]
id_projet = data2['id_projet']
data2 = data2.drop(['id_projet'], axis = 1)
LE = {col : LabelEncoder() for col in columns_encoding}
for col in columns_encoding :
data2[col] = LE[col].fit_transform(data2[col])
data2 = data2.drop(['LI_DOMN_INTVTN','ID_EDS5','LI_ADR_6','TYPE_PTF'], axis = 1)
barecode = data2['BARCODE']
data2 = data2.drop(['BARCODE'], axis = 1)
correlated_features = set()
correlation_matrix = data2.corr()
for i in range(len(correlation_matrix.columns)):
for j in range(i):
if abs(correlation_matrix.iloc[i, j]) > 0.8:
colname = correlation_matrix.columns[i]
correlated_features.add(colname)
len(correlated_features)
66
correlated_features = list(correlated_features)
stay = ['DD_HISTO','DIF_YEAR','DT_SIGNER_CTR_CRED','DUREE_RESTANTE','NB_CONX_WEB_12M']
correlated_features = [e for e in correlated_features if e not in stay]
for col in columns_encoding :
data2[col] = LE[col].inverse_transform(data2[col])
data2 = data2.drop(list(correlated_features), axis = 1)
data2['BARCODE'] = barecode
data2['id_projet'] = id_projet
image_path = 'D:/ECM/DigitalLab/Projet_2/data2/images/fig_'
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)']
code = [0, 1, 2]
def transform(x, data, col) :
if x < np.quantile(data[col], 0.1) :
x = (np.quantile(data[col], 0.1))/(1.5)
return x
elif x > np.quantile(data[col], 0.9) :
x = (np.quantile(data[col], 0.9))*(1.5)
return x
else :
return x
def graph(data) :
all_columns = data.columns
data3 = data.copy()
for col in all_columns :
if data[col].dtype == 'int' :
x,y = col, 'BARCODE'
df1 = data.groupby(y)[x].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('num').reset_index()
df1['num']=df1['num'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
fig = px.bar(df1, x=y, y='num', color = x, text='num',color_discrete_sequence=color)
fig.write_image(image_path + col +".png")
fig.write_html(image_path + col +".html")
elif data[col].dtype == 'float' :
if 'TP_' in col :
x,y = col, 'BARCODE'
df1 = data.groupby(y)[x].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('num').reset_index()
df1['num']=df1['num'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
fig = px.bar(df1, x=y, y='num', color = x, text='num',color_discrete_sequence=color)
fig.write_image(image_path + col +".png")
fig.write_html(image_path + col +".html")
else :
fig = px.box(data, x="period", y=col, color="BARCODE", color_discrete_sequence=color)
fig.write_image(image_path + col + "_1.png")
fig.write_html(image_path + col +"_1.html")
DATA = []
for c in code :
ex = data[data['BARCODE'] == c ]
ex[col] = ex[col].apply(lambda x : transform(x, ex, col))
DATA.append(ex)
data3 = pd.concat(DATA)
fig = px.box(data3, x="period", y=col, color="BARCODE", color_discrete_sequence=color)
fig.write_image(image_path + col + "_2.png")
fig.write_html(image_path + col +"_2.html")
elif data3[col].dtype == 'O' :
x,y = col, 'BARCODE'
df1 = data.groupby(y)[x].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('num').reset_index()
df1['num']=df1['num'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
fig = px.bar(df1, x=y, y='num', color = x, text='num',color_discrete_sequence=color)
fig.write_image(image_path + col +".png")
fig.write_html(image_path + col +".html")
return data3
columns4 = ['NB_CONX_WEB_12M','SLD_MOY_CRDT_12_MOIS','PNB', 'AGE','ANCIENNETE_MOIS','TX_CRED','DUREE_RESTANTE',
'MT_ACCORDE_PRET_PAR_CR', 'CA_REMISE_T']
#for col in columns4 :
# DATA = []
# for c in code :
# ex = data2[data2['BARCODE'] == c ]
# ex[col] = ex[col].apply(lambda x : transform(x, ex, col))
# DATA.append(ex)
# data3 = pd.concat(DATA)
all_columns3 = ['id_projet', 'Nombre_pret', 'CA_REMISE_T', 'situ_globale','type_residence_1','NB_PARTENAIRES','LI_DESTN_FINCT',
'nb_mois_dernier_entretien', 'type_ptf_cc', 'PNB', 'TP_VU_M12','NB_CONX_BAM_12M', 'NB_CONX_WEB_12M',
'MT_ENC_EP_BILAN','MT_ENC_COLLECTE','NB_MVT_DBTR__DAV_12M','SLD_COMPTA_DAV','SLD_MOY_DBIT_12_MOIS',
'SLD_MOY_CRDT_12_MOIS','N_TP_CREDIT_IMMO','TP_CREDIT_CONSO','N_TP_EPARGNE_ASS_VIE','N_TP_PROJET',
'TP_EPARGNE_BILANTIELLE','TP_GDECES', 'TP_PJ','TP_PROTECTION_PERSONNES','TP_AUTO','TP_PROTECTION_BIENS',
'TP_ASSURANCE', 'N_TP_CARTES', 'LI_REGRP_CSP','SEGMT','AGE','ANCIENNETE_MOIS','TX_CRED','DUREE_RESTANTE',
'MT_ACCORDE_PRET_PAR_CR','MT_PROCH_ECHCE_THRQ','MT_VERSE_EPRGN_DSOUSC','DT_SIGNER_CTR_CRED', 'BARCODE','period']
all_columns2 = ['id_projet', 'Nombre_pret', 'CA_REMISE_T', 'situ_globale','type_residence_1','NB_PARTENAIRES','LI_DESTN_FINCT',
'nb_mois_dernier_entretien', 'type_ptf_cc', 'PNB', 'TP_VU_M12','NB_CONX_BAM_12M', 'NB_CONX_WEB_12M',
'MT_ENC_EP_BILAN','MT_ENC_COLLECTE','NB_MVT_DBTR__DAV_12M','SLD_COMPTA_DAV','SLD_MOY_DBIT_12_MOIS',
'SLD_MOY_CRDT_12_MOIS','N_TP_CREDIT_IMMO','TP_CREDIT_CONSO','N_TP_EPARGNE_ASS_VIE','N_TP_PROJET',
'TP_EPARGNE_BILANTIELLE','TP_GDECES', 'TP_PJ','TP_PROTECTION_PERSONNES','TP_AUTO','TP_PROTECTION_BIENS',
'TP_ASSURANCE', 'N_TP_CARTES', 'LI_REGRP_CSP','SEGMT','AGE','ANCIENNETE_MOIS','TX_CRED','DUREE_RESTANTE',
'MT_ACCORDE_PRET_PAR_CR','MT_PROCH_ECHCE_THRQ','MT_VERSE_EPRGN_DSOUSC','DT_SIGNER_CTR_CRED', 'BARCODE']
data4 = data2[all_columns2]
data4.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 204534 entries, 0 to 204533 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id_projet 204534 non-null object 1 Nombre_pret 204534 non-null int64 2 CA_REMISE_T 204534 non-null float64 3 situ_globale 204534 non-null object 4 type_residence_1 204534 non-null object 5 NB_PARTENAIRES 204534 non-null float64 6 LI_DESTN_FINCT 204534 non-null object 7 nb_mois_dernier_entretien 204534 non-null float64 8 type_ptf_cc 204534 non-null object 9 PNB 204534 non-null float64 10 TP_VU_M12 204534 non-null float64 11 NB_CONX_BAM_12M 204534 non-null float64 12 NB_CONX_WEB_12M 204534 non-null float64 13 MT_ENC_EP_BILAN 204534 non-null float64 14 MT_ENC_COLLECTE 204534 non-null float64 15 NB_MVT_DBTR__DAV_12M 204534 non-null float64 16 SLD_COMPTA_DAV 204534 non-null float64 17 SLD_MOY_DBIT_12_MOIS 204534 non-null float64 18 SLD_MOY_CRDT_12_MOIS 204534 non-null float64 19 N_TP_CREDIT_IMMO 204534 non-null float64 20 TP_CREDIT_CONSO 204534 non-null float64 21 N_TP_EPARGNE_ASS_VIE 204534 non-null float64 22 N_TP_PROJET 204534 non-null float64 23 TP_EPARGNE_BILANTIELLE 204534 non-null float64 24 TP_GDECES 204534 non-null float64 25 TP_PJ 204534 non-null float64 26 TP_PROTECTION_PERSONNES 204534 non-null float64 27 TP_AUTO 204534 non-null float64 28 TP_PROTECTION_BIENS 204534 non-null float64 29 TP_ASSURANCE 204534 non-null float64 30 N_TP_CARTES 204534 non-null float64 31 LI_REGRP_CSP 204534 non-null object 32 SEGMT 204534 non-null object 33 AGE 204534 non-null float64 34 ANCIENNETE_MOIS 204534 non-null float64 35 TX_CRED 204534 non-null float64 36 DUREE_RESTANTE 204458 non-null float64 37 MT_ACCORDE_PRET_PAR_CR 204534 non-null float64 38 MT_PROCH_ECHCE_THRQ 204534 non-null float64 39 MT_VERSE_EPRGN_DSOUSC 204534 non-null float64 40 DT_SIGNER_CTR_CRED 204534 non-null int64 41 BARCODE 204534 non-null int64 dtypes: float64(32), int64(3), object(7) memory usage: 65.5+ MB
data3 = data3[all_columns3]
filepath4 = r'D:\ECM\DigitalLab\Projet_2\final_data\final_data.csv'
data4.to_csv(filepath4)
filepath3 = r'D:\ECM\DigitalLab\Projet_2\final_data\data_analysis.csv'
data3.to_csv(filepath3)
import plotly.io as pio
pio.kaleido.scope.default_format = "svg"
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)']
fig = px.histogram(data3, x="AGE", color="BARCODE",
marginal="box", color_discrete_sequence=color)
fig.show()
fig = px.box(data3, x="period", y='CA_REMISE_T', color="BARCODE", color_discrete_sequence=color)
fig.show()
fig = px.box(data3, x="period", y='MT_ACCORDE_PRET_PAR_CR', color="BARCODE", color_discrete_sequence=color)
fig.show()
columns4 = ['NB_CONX_WEB_12M','SLD_MOY_CRDT_12_MOIS','PNB', 'AGE','ANCIENNETE_MOIS','TX_CRED','DUREE_RESTANTE',
'MT_ACCORDE_PRET_PAR_CR', 'CA_REMISE_T']
fig = px.box(data3, x="period", y='ANCIENNETE_MOIS', color="BARCODE", color_discrete_sequence=color)
fig.show()
fig = px.box(data3, x="period", y='NB_CONX_WEB_12M', color="BARCODE", color_discrete_sequence=color)
fig.show()
fig = px.box(data3, x="period", y='DUREE_RESTANTE', color="BARCODE", color_discrete_sequence=color)
fig.show()
fig = px.box(data3, x="period", y='PNB', color="BARCODE", color_discrete_sequence=color)
fig.show()
x,y = 'BARCODE', 'situ_globale'
df1 = data3.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
df1['percent']=df1['percent'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)', 'rgb(128, 183, 255)']
fig = px.bar(df1, x=x, y='percent', color = y, text = 'percent', color_discrete_sequence=color)
fig.show()
x,y = 'BARCODE', 'LI_REGRP_CSP'
df1 = data3.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
df1['percent']=df1['percent'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)', 'rgb(128, 183, 255)']
fig = px.bar(df1, x=x, y='percent', color = y, text = 'percent', color_discrete_sequence=color)
fig.show()
x,y = 'BARCODE', 'TP_ASSURANCE'
df1 = data3.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
df1['percent']=df1['percent'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
df1[y] = df1[y].astype(str)
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)', 'rgb(128, 183, 255)']
fig = px.bar(df1, x=x, y='percent', color = y, text = 'percent', color_discrete_sequence=color)
fig.show()
x,y = 'BARCODE', 'TP_PROTECTION_PERSONNES'
df1 = data3.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
df1['percent']=df1['percent'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
df1[y] = df1[y].astype(str)
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)', 'rgb(128, 183, 255)']
fig = px.bar(df1, x=x, y='percent', color = y, text = 'percent', color_discrete_sequence=color)
fig.show()
x,y = 'BARCODE', 'TP_PROTECTION_BIENS'
df1 = data3.groupby(x)[y].value_counts(normalize=True)
df1 = df1.mul(100)
df1 = df1.rename('percent').reset_index()
df1['percent']=df1['percent'].apply(lambda x:round(x,2))
df1[x] = df1[x].astype(str)
df1[y] = df1[y].astype(str)
color = ['rgb(26, 140, 255)', 'rgb(77, 77, 255)', 'rgb(0, 0, 179)', 'rgb(128, 183, 255)']
fig = px.bar(df1, x=x, y='percent', color = y, text = 'percent', color_discrete_sequence=color)
fig.show()